Read the data

Read the dataset used in this assignment. Pay attention to the extension of the datafile.

data = read_xlsx('C:/Users/Deanne/Desktop/Term 4/SUBJECTS/STATS PROGRAMMING/assignment_3_dataset.xlsx')

Correct coding errors

If you find values in the dataset during the EDA, that are not correct based on the provided descriptions of the variables of the dataset please correct them here.

data = data %>%
  mutate(sex = dplyr::recode(sex, "male" = 1, "female" = 2, "woman" = 2)) %>%
  mutate(pain = case_when(pain > 10 ~ 10, .default = pain)) %>% 
  mutate(mindfulness = case_when(mindfulness > 6 ~ 6, .default = mindfulness)) %>%
  mutate(cortisol = round((cortisol_serum + cortisol_saliva) / 2, digits = 2)) 
data
## # A tibble: 160 × 13
##    ID     pain   sex   age STAI_trait pain_cat cortisol_serum cortisol_saliva
##    <chr> <dbl> <dbl> <dbl>      <dbl>    <dbl>          <dbl>           <dbl>
##  1 ID_1      5     2    38         39       25           4.67            4.78
##  2 ID_2      4     2    36         46       31           6.01            6.71
##  3 ID_3      5     2    51         49       32           5.18            4.75
##  4 ID_4      7     2    39         48       41           6.65            6.68
##  5 ID_5      4     1    48         36       26           2.95            3.2 
##  6 ID_6      6     2    45         37       28           4.32            4.36
##  7 ID_7      4     1    45         34       22           5.32            5.19
##  8 ID_8      5     2    35         39       32           5.26            5.53
##  9 ID_9      4     1    39         31       27           3.36            3.3 
## 10 ID_10     5     2    33         38       36           5.96            5.78
## # ℹ 150 more rows
## # ℹ 5 more variables: mindfulness <dbl>, weight <dbl>, IQ <dbl>,
## #   household_income <dbl>, cortisol <dbl>
N = nrow(data)
N
## [1] 160

In order to test the more complex model for outliers and to test the assumptions first build the model.

# data filtered to relevant variables only
simple_data = data %>% select(sex, age, pain)

# some plotting
simple_data_plot = plot_ly(data = simple_data, x=~sex, y=~age, z=~pain, type="scatter3d") %>%
  layout(showlegend = FALSE)

simple_data_plot
# create a simple linear regression model
simple_model = lm(pain ~ sex + age, data = simple_data)

summary(simple_model)
## 
## Call:
## lm(formula = pain ~ sex + age, data = simple_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.7245 -1.0069  0.0985  0.9990  5.0985 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  8.61249    1.05866   8.135 1.17e-13 ***
## sex         -0.04122    0.24019  -0.172 0.863972    
## age         -0.08850    0.02386  -3.710 0.000287 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.516 on 157 degrees of freedom
## Multiple R-squared:  0.0806, Adjusted R-squared:  0.06888 
## F-statistic: 6.881 on 2 and 157 DF,  p-value: 0.001365

Check for outlier values in the model.

# add Cook's distance
simple_data = simple_data %>% mutate(cooks_d = cooks.distance(simple_model))

# threshold for outlier detection
threshold = 4/N
threshold
## [1] 0.025
# show outliers
simple_data %>% filter(cooks_d >= threshold)
## # A tibble: 7 × 4
##     sex   age  pain cooks_d
##   <dbl> <dbl> <dbl>   <dbl>
## 1     2    43     1  0.0277
## 2     1    46     8  0.0360
## 3     2    33     3  0.0271
## 4     2    44     8  0.0251
## 5     2    43     1  0.0277
## 6     2    44     1  0.0293
## 7     2    41    10  0.0462

Check the normality assumption.

hist(simple_data$age)

shapiro.test(simple_data$age) 
## 
##  Shapiro-Wilk normality test
## 
## data:  simple_data$age
## W = 0.99013, p-value = 0.3293
hist(simple_data$sex)

shapiro.test(simple_data$sex) 
## 
##  Shapiro-Wilk normality test
## 
## data:  simple_data$sex
## W = 0.6355, p-value < 2.2e-16

Check the linearity assumption.

cor(simple_data$age, simple_data$pain)
## [1] -0.2835908
cor(simple_data$sex, simple_data$pain) 
## [1] -0.002197505

Check the homoscedasticty assumption (homogeneity of variance).

ncvTest(simple_model) 
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 2.513787, Df = 1, p = 0.11285

Multicollinearity assumption

vif(simple_model) 
##      sex      age 
## 1.001486 1.001486

If based on the assumption tests you decide to drop a predictor variable you should do that here. Create your updated model.

# data filtered to relevant variables only
complex_data = data %>% select(sex, age, STAI_trait, pain_cat, mindfulness, cortisol, pain)

# create a simple linear regression model
complex_model = lm(pain ~ sex + age + STAI_trait + pain_cat + mindfulness + cortisol, data = complex_data)

summary(complex_model)
## 
## Call:
## lm(formula = pain ~ sex + age + STAI_trait + pain_cat + mindfulness + 
##     cortisol, data = complex_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.2112 -0.8343 -0.0343  0.7959  4.6105 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.69609    1.79805   0.943  0.34702    
## sex         -0.27704    0.21870  -1.267  0.20715    
## age         -0.02492    0.02459  -1.013  0.31253    
## STAI_trait  -0.01974    0.02836  -0.696  0.48745    
## pain_cat     0.09033    0.02916   3.097  0.00232 ** 
## mindfulness -0.11075    0.12979  -0.853  0.39482    
## cortisol     0.63331    0.13387   4.731 5.05e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.292 on 153 degrees of freedom
## Multiple R-squared:  0.3497, Adjusted R-squared:  0.3242 
## F-statistic: 13.71 on 6 and 153 DF,  p-value: 1.977e-12
complex_data = complex_data %>% mutate(cooks_d = cooks.distance(complex_model))

complex_data %>% filter(cooks_d >= threshold)
## # A tibble: 9 × 8
##     sex   age STAI_trait pain_cat mindfulness cortisol  pain cooks_d
##   <dbl> <dbl>      <dbl>    <dbl>       <dbl>    <dbl> <dbl>   <dbl>
## 1     2    44         47       28        4.01     5.43     2  0.0259
## 2     1    46         37       24        3.1      6.58     8  0.0569
## 3     2    38         38       21        3.67     5.18     7  0.0338
## 4     2    43         50       26        3.96     3.99     1  0.0623
## 5     1    37         41       31        1        6.6      4  0.0551
## 6     1    39         36       39        4.34     3.79     7  0.0310
## 7     2    47         41       24        1.28     4.6      6  0.0275
## 8     2    41         44       30        3.3      5.99    10  0.0383
## 9     1    48         48       30        2.55     5.54     2  0.0470

Normality assumption

hist(complex_data$STAI_trait)

shapiro.test(complex_data$STAI_trait) 
## 
##  Shapiro-Wilk normality test
## 
## data:  complex_data$STAI_trait
## W = 0.98838, p-value = 0.2086
hist(complex_data$pain_cat)

shapiro.test(complex_data$pain_cat) 
## 
##  Shapiro-Wilk normality test
## 
## data:  complex_data$pain_cat
## W = 0.97803, p-value = 0.01189
hist(complex_data$mindfulness)

shapiro.test(complex_data$mindfulness) 
## 
##  Shapiro-Wilk normality test
## 
## data:  complex_data$mindfulness
## W = 0.99426, p-value = 0.7862
hist(complex_data$cortisol)

shapiro.test(complex_data$cortisol) 
## 
##  Shapiro-Wilk normality test
## 
## data:  complex_data$cortisol
## W = 0.9844, p-value = 0.0692

Linearity assumption

cor(complex_data$STAI_trait, complex_data$pain)
## [1] 0.2532565
cor(complex_data$pain_cat, complex_data$pain)
## [1] 0.4581928
cor(complex_data$mindfulness, complex_data$pain)
## [1] -0.2964476
cor(complex_data$cortisol, complex_data$pain)
## [1] 0.4998132

Homoscedasticty assumption (homogeneity of variance)

ncvTest(complex_model) 
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 0.1507902, Df = 1, p = 0.69778

Multicollinearity assumption

vif(complex_model)
##         sex         age  STAI_trait    pain_cat mindfulness    cortisol 
##    1.143878    1.466234    2.033713    1.850074    1.519735    1.633069

Compare the two models.

anova(simple_model, complex_model)
## Analysis of Variance Table
## 
## Model 1: pain ~ sex + age
## Model 2: pain ~ sex + age + STAI_trait + pain_cat + mindfulness + cortisol
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1    157 360.86                                  
## 2    153 255.25  4    105.61 15.826 7.337e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1